cor2cov = function(R, std) {
  Sigma = outer(std, std) * R
  return(Sigma)
}

get_upper_tri = function(cormat){
    cormat[lower.tri(cormat)] = NA
    diag(cormat) = NA
    return(cormat)
}

p_dens = function(p_mat) {
  p_vec = p_mat[upper.tri(p_mat)]
  
  df_line = data.frame(xintercept = c(0.01, 0.05), 
                       Lines = c("p = 0.01", "p = 0.05"),
                       stringsAsFactors = FALSE)
  
  fig_p = data.frame(p = p_vec) %>%
    ggplot(aes(x = p)) +
    stat_ecdf(geom = "point") +
    geom_vline(aes(xintercept = xintercept, color = Lines), df_line, 
               linetype = "dashed", size = 1) +
    labs(x = "P-value", y = "Cumulative density") +
    scale_color_discrete(name = NULL) +
    theme_bw()
  return(fig_p)
}

p_filter = function(mat, mat_p, max_p){
  ind_p = mat_p
  ind_p[mat_p > max_p] = 0
  ind_p[mat_p <= max_p] = 1
  
  mat_filter = mat * ind_p
  return(mat_filter)
}

hard_thresh = function(R, th){
  R_th = R
  R_th[abs(R) <= th] = 0
  return(R_th)
}

1. Data generation

1.1 Absolute abundances

set.seed(123)
n = 50
d = 100
mu = c(rep(NA, 4), # Taxa with correlations
       10000, 10000, # High abundant taxa
       sample(c(200, 50), d - 6, replace = TRUE, prob = c(0.8, 0.2))) # Low abundant taxa
# NB distribution
dispersion = 0.5

# Absolute abundances
A = matrix(NA, ncol = n, nrow = d)
for (i in seq.int(from = 5, to = d)) {
  A[i, ] = rnbinom(n = n, size = 1/dispersion, mu = mu[i])
}

# Dependent pairs of taxa
abn_mean = c(2000, 2000)
template_var = log((abn_mean + dispersion * abn_mean^2)/abn_mean^2 + 1)
template_mean = log(abn_mean) - template_var/2

Sigma = cor2cov(R = diag(1, nrow = 2), std = sqrt(template_var))
template = t(MASS::mvrnorm(n = n, mu = template_mean, Sigma = Sigma))
template1 = template[1, ]
template2 = template[2, ]
poly1 = template_mean[1] * poly(x = template1, degree = 1, raw = FALSE)[, 1] + template_mean[1]
poly2 = template_mean[2] * poly(x = template2, degree = 2, raw = FALSE)[, 2] + template_mean[2]
A_dep = round(exp(rbind(template, poly1, poly2)))

for (i in 1:4) {
  A[i, ] = A_dep[i, ]
}
mu[1:2] = 2000
mu[3:4] = 10000

taxa_id = paste0("T", seq_len(d))
sample_id = paste0("S", seq_len(n))
rownames(A) = taxa_id
colnames(A) = sample_id

p_a = data.frame(t(log(A[1:5, ]))) %>% 
  ggpairs(title = "Synthetic Log Absolute Abundances",
          progress = FALSE,
          upper = NULL, diag = NULL) +
  theme_bw() + 
  theme(plot.title = element_text(hjust = 0.5),
        strip.background = element_rect(fill = "white"))
print(p_a)

1.2 Observed abundances

# Sequencing efficiency
C = rbeta(n = d, shape1 = 5, shape2 = 5)

# Microbial loads in the ecosystem
A_prim = A * C
A_dot = colSums(A_prim)

# Relative abundances in the ecosystem
R = A_prim/t(replicate(d, A_dot))

# Sampling fractions
S = rbeta(n = n, shape1 = 2, shape2 = 10)

# Library sizes
O_dot = round(S * A_dot)
# Observed abundances
O = matrix(NA, nrow = d, ncol = n)
for (i in seq(n)) {
  O[, i] = rmultinom(1, size = O_dot[i], prob = R[, i])
}
rownames(O) = taxa_id
colnames(O) = sample_id

# Random errors
E = O/(A * C * t(replicate(d, S)))

# Log scale parameters
a = log(A)
c = log(C)
s = log(S)
e = log(E)
# Log scale variables
o = log(O)
r = O/t(replicate(d, O_dot))

p_o = data.frame(t(log(O[1:5, ]))) %>% 
  ggpairs(title = "Synthetic Log Observed Abundances",
          progress = FALSE, upper = NULL, diag = NULL) +
  theme_bw() + 
  theme(plot.title = element_text(hjust = 0.5),
        strip.background = element_rect(fill = "white"))
print(p_o)

1.3 Example: taxon 1 vs. taxon 2

# Absolute abundance
A_t12 = data.frame(t(A[1:2, ]))
ls_cor_test = cor.test(x = A_t12$T1, y = A_t12$T2, method = "pearson")
df_ann = data.frame(x = 4000, y = 4000) %>%
  mutate(label = paste0("r = ", round(ls_cor_test$estimate, 2),
                        "\n", "p = ", round(ls_cor_test$p.value, 2)))
p_A_t12 = A_t12 %>%
  ggplot(aes(x = T1, y = T2)) +
  geom_point(size = 4, alpha = 0.6) +
  labs(title = "Absolute Abundances for T1 and T2") +
  geom_label(data = df_ann, aes(x = x, y = y, label = label), 
             size = 6, vjust = -0.5, hjust = 0, color = "black") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

# Observed abundance
O_t12 = data.frame(t(O[1:2, ]))
ls_cor_test = cor.test(x = O_t12$T1, y = O_t12$T2, method = "pearson")
df_ann = data.frame(x = 600, y = 710) %>%
  mutate(label = paste0("r = ", round(ls_cor_test$estimate, 2),
                        "\n", "p = ", round(ls_cor_test$p.value, 2)))
p_O_t12 = O_t12 %>%
  ggplot(aes(x = T1, y = T2)) +
  geom_point(size = 4, alpha = 0.6) +
  labs(title = "Observed Abundances for T1 and T2") +
  geom_label(data = df_ann, aes(x = x, y = y, label = label), 
             size = 6, vjust = -0.5, hjust = 0, color = "black") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

ggarrange(p_A_t12, p_O_t12, ncol = 2)

2. Standard Pearson correlation

2.1 Correlation matrix

o[is.infinite(o)] = NA
res_sample = Hmisc::rcorr(t(o), type = "pearson")
corr_sample = res_sample$r
corr_sample_p = res_sample$P
diag(corr_sample_p) = 0

corr_sample_fl = p_filter(corr_sample, corr_sample_p, max_p = 0.005)

df = corr_sample_fl[1:5, 1:5]
df_p = data.frame(get_upper_tri(df)) %>%
  rownames_to_column("var1") %>%
  pivot_longer(cols = T1:T5, names_to = "var2", values_to = "value") %>%
  filter(!is.na(value)) %>%
  mutate(value = round(value, 2))
df_p$var1 = factor(df_p$var1, levels = paste0("T", 1:5))
df_p$var2 = factor(df_p$var2, levels = paste0("T", 1:5))

p_sample = df_p %>%
  ggplot(aes(var1, var2, fill = value)) +
  geom_tile(color = "black") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "white",
                       midpoint = 0, limit = c(-1, 1), space = "Lab", 
                       name = NULL) +
  scale_x_discrete(drop = FALSE, position = "top") +
  scale_y_discrete(drop = FALSE, limits = rev, position = "right") +
  geom_text(aes(var1, var2, label = value), color = "black", size = 4) +
  labs(x = NULL, y = NULL, title = "Standard Pearson") +
  theme_minimal() +
  theme(axis.text.x = element_text(vjust = 1, size = 12, hjust = 1),
        axis.text.y = element_text(size = 12),
        plot.title = element_text(hjust = 0.5),
        panel.grid.major = element_blank(),
        axis.ticks = element_blank(),
        legend.position = c(0.7, 0.8),
        legend.direction = "horizontal")+
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                               title.position = "top", title.hjust = 0.5)) +
  coord_fixed()

p_sample

2.2 P-value

p_dense_sample = p_dens(corr_sample_p)
p_dense_sample = p_dense_sample + 
  labs(title = "Standard Pearson") +
  theme(plot.title = element_text(hjust = 0.5))

ggplotly(p_dense_sample)

3. SparCC

corr_sparcc = sparcc(t(O))$Cor
colnames(corr_sparcc) = rownames(O)
rownames(corr_sparcc) = rownames(O)

corr_sparcc_th = hard_thresh(corr_sparcc, th = 0.3)

df = corr_sparcc_th[1:5, 1:5]
df_p = data.frame(get_upper_tri(df)) %>%
  rownames_to_column("var1") %>%
  pivot_longer(cols = T1:T5, names_to = "var2", values_to = "value") %>%
  filter(!is.na(value)) %>%
  mutate(value = round(value, 2))
df_p$var1 = factor(df_p$var1, levels = paste0("T", 1:5))
df_p$var2 = factor(df_p$var2, levels = paste0("T", 1:5))

p_sparcc = df_p %>%
  ggplot(aes(var1, var2, fill = value)) +
  geom_tile(color = "black") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "white",
                       midpoint = 0, limit = c(-1, 1), space = "Lab", 
                       name = NULL) +
  scale_x_discrete(drop = FALSE, position = "top") +
  scale_y_discrete(drop = FALSE, limits = rev, position = "right") +
  geom_text(aes(var1, var2, label = value), color = "black", size = 4) +
  labs(x = NULL, y = NULL, title = "SparCC") +
  theme_minimal() +
  theme(axis.text.x = element_text(vjust = 1, size = 12, hjust = 1),
        axis.text.y = element_text(size = 12),
        plot.title = element_text(hjust = 0.5),
        panel.grid.major = element_blank(),
        axis.ticks = element_blank(),
        legend.position = c(0.7, 0.8),
        legend.direction = "horizontal")+
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                               title.position = "top", title.hjust = 0.5)) +
  coord_fixed()

p_sparcc

4. SECOM

OTU = otu_table(O, taxa_are_rows = TRUE)
meta_data = data.frame(sample_id = sample_id)
META = sample_data(meta_data)
sample_names(META) = meta_data$sample_id
otu_data = phyloseq(OTU, META)

pseqs = list(c(otu_data, otu_data))
pseudo = 0; zero_cut = 0.5; corr_cut = 0.5; lib_cut = 1000
wins_quant = c(0.05, 0.95); method = "pearson"; soft = FALSE; thresh_len = 20
n_cv = 10; seed = 123; thresh_hard = 0.3; max_p = 0.005; n_cl = 5

res_linear = secom_linear(pseqs, pseudo, zero_cut, corr_cut, lib_cut, 
                          wins_quant, method, soft, thresh_len, n_cv, 
                          seed, thresh_hard, max_p, n_cl)

R = 1000; max_p = 0.005
res_dist = secom_dist(pseqs, pseudo, zero_cut, corr_cut, lib_cut, 
                      wins_quant, R, seed, max_p, n_cl)

4.1 Bias-corrected abundance estimation

y_hat = res_linear$y_hat

p_y_hat = data.frame(t(y_hat[1:5, ])) %>% 
  ggpairs(title = "Bias-Corrected Abundances", 
          progress = FALSE, upper = NULL, diag = NULL) +
  theme_bw() + 
  theme(plot.title = element_text(hjust = 0.5),
        strip.background = element_rect(fill = "white"))

print(p_y_hat)

4.2 Linear correlation

4.21 Thresholding

row_ind = which(rownames(res_linear$corr_th) %in% paste0("T", 1:5))
col_ind = which(colnames(res_linear$corr_th) %in% paste0("T", 1:5))
df = res_linear$corr_th[row_ind, col_ind]
df_p = data.frame(get_upper_tri(df)) %>%
  rownames_to_column("var1") %>%
  pivot_longer(cols = T1:T5, names_to = "var2", values_to = "value") %>%
  filter(!is.na(value)) %>%
  mutate(value = round(value, 2))
df_p$var1 = factor(df_p$var1, levels = paste0("T", 1:5))
df_p$var2 = factor(df_p$var2, levels = paste0("T", 1:5))

p_secom1 = df_p %>%
  ggplot(aes(var1, var2, fill = value)) +
  geom_tile(color = "black") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "white",
                       midpoint = 0, limit = c(-1, 1), space = "Lab", 
                       name = NULL) +
  scale_x_discrete(drop = FALSE, position = "top") +
  scale_y_discrete(drop = FALSE, limits = rev, position = "right") +
  geom_text(aes(var1, var2, label = value), color = "black", size = 4) +
  labs(x = NULL, y = NULL, title = "SECOM (Pearson1)") +
  theme_minimal() +
  theme(axis.text.x = element_text(vjust = 1, size = 12, hjust = 1),
        axis.text.y = element_text(size = 12),
        plot.title = element_text(hjust = 0.5),
        panel.grid.major = element_blank(),
        axis.ticks = element_blank(),
        legend.position = c(0.7, 0.8),
        legend.direction = "horizontal")+
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                               title.position = "top", title.hjust = 0.5)) +
  coord_fixed()

p_secom1

4.22 Filtering

df = res_linear$corr_fl[row_ind, col_ind]
df_p = data.frame(get_upper_tri(df)) %>%
  rownames_to_column("var1") %>%
  pivot_longer(cols = T1:T5, names_to = "var2", values_to = "value") %>%
  filter(!is.na(value)) %>%
  mutate(value = round(value, 2))
df_p$var1 = factor(df_p$var1, levels = paste0("T", 1:5))
df_p$var2 = factor(df_p$var2, levels = paste0("T", 1:5))

p_secom2 = df_p %>%
  ggplot(aes(var1, var2, fill = value)) +
  geom_tile(color = "black") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "white",
                       midpoint = 0, limit = c(-1, 1), space = "Lab", 
                       name = NULL) +
  scale_x_discrete(drop = FALSE, position = "top") +
  scale_y_discrete(drop = FALSE, limits = rev, position = "right") +
  geom_text(aes(var1, var2, label = value), color = "black", size = 4) +
  labs(x = NULL, y = NULL, title = "SECOM (Pearson2)") +
  theme_minimal() +
  theme(axis.text.x = element_text(vjust = 1, size = 12, hjust = 1),
        axis.text.y = element_text(size = 12),
        plot.title = element_text(hjust = 0.5),
        panel.grid.major = element_blank(),
        axis.ticks = element_blank(),
        legend.position = c(0.7, 0.8),
        legend.direction = "horizontal")+
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                               title.position = "top", title.hjust = 0.5)) +
  coord_fixed()

p_secom2

4.23 P-value

p_dense_secom1 = p_dens(res_linear$corr_p)
p_dense_secom1 = p_dense_secom1 + 
  labs(title = "SECOM (Pearson2)") +
  theme(plot.title = element_text(hjust = 0.5))

ggplotly(p_dense_secom1)

4.3 Distance correlation

4.31 Filtering

df = res_dist$dcorr_fl[row_ind, col_ind]
df_p = data.frame(get_upper_tri(df)) %>%
  rownames_to_column("var1") %>%
  pivot_longer(cols = T1:T5, names_to = "var2", values_to = "value") %>%
  filter(!is.na(value)) %>%
  mutate(value = round(value, 2))
df_p$var1 = factor(df_p$var1, levels = paste0("T", 1:5))
df_p$var2 = factor(df_p$var2, levels = paste0("T", 1:5))

p_secom3 = df_p %>%
  ggplot(aes(var1, var2, fill = value)) +
  geom_tile(color = "black") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "white",
                       midpoint = 0, limit = c(-1, 1), space = "Lab", 
                       name = NULL) +
  scale_x_discrete(drop = FALSE, position = "top") +
  scale_y_discrete(drop = FALSE, limits = rev, position = "right") +
  geom_text(aes(var1, var2, label = value), color = "black", size = 4) +
  labs(x = NULL, y = NULL, title = "SECOM (Distance)") +
  theme_minimal() +
  theme(axis.text.x = element_text(vjust = 1, size = 12, hjust = 1),
        axis.text.y = element_text(size = 12),
        plot.title = element_text(hjust = 0.5),
        panel.grid.major = element_blank(),
        axis.ticks = element_blank(),
        legend.position = c(0.7, 0.8),
        legend.direction = "horizontal")+
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                               title.position = "top", title.hjust = 0.5)) +
  coord_fixed()

p_secom3

4.32 P-value

p_dense_secom2 = p_dens(res_dist$dcorr_p)
p_dense_secom2 = p_dense_secom2 + 
  labs(title = "SECOM (Distance)") +
  theme(plot.title = element_text(hjust = 0.5))

ggplotly(p_dense_secom2)

5. Outputs

p_main1 = ggarrange(ggmatrix_gtable(p_a), ggmatrix_gtable(p_o), 
                    ncol = 2, nrow = 1, labels = c("a", "b"))
p_main2 = ggarrange(p_sample, p_sparcc, p_secom3, ncol = 3,
                    labels = c("c", "d", "e"), common.legend = TRUE, 
                    legend = "bottom")
p_main = ggarrange(p_main1, p_main2, ncol = 1, nrow = 2)
ggsave(plot = p_main, "../images/main/illustrate.pdf", height = 12, width = 12)   
ggsave(plot = p_main, "../images/main/illustrate.jpeg", height = 12, width = 12, dpi = 300)

df_p = data.frame(p = corr_sample_p[upper.tri(corr_sample_p)], method = "Pearson") %>%
  bind_rows(
    data.frame(p = res_dist$dcorr_p[upper.tri(res_dist$dcorr_p)], method = "SECOM (Distance)")
  )
df_line = data.frame(xintercept = c(0.01, 0.05),
                     stringsAsFactors = FALSE)
p_supp = df_p %>%
  ggplot(aes(x = p, color = method)) +
  scale_color_npg(name = NULL) +
  stat_ecdf(geom = "point") +
  geom_vline(xintercept = 0.005, 
             linetype = "dashed", size = 1) +
  labs(x = "P-value", y = "Cumulative density") +
  scale_x_continuous(breaks = c(0.005, 0.25, 0.50, 0.75, 1.00)) +
  theme_bw()
p_supp

ggsave(plot = p_supp, "../images/supp/supp_p_value.pdf", height = 5, width = 6.25)   
ggsave(plot = p_supp, "../images/supp/supp_p_value.jpeg", height = 5, width = 6.25, dpi = 300)

Session information

sessionInfo()
R version 4.1.1 (2021-08-10)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19044)

Matrix products: default

locale:
[1] LC_COLLATE=English_United States.1252 
[2] LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252
[4] LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] energy_1.7-8      Hmisc_4.5-0       Formula_1.2-4     survival_3.2-11  
 [5] lattice_0.20-44   DescTools_0.99.43 doRNG_1.8.2       rngtools_1.5.2   
 [9] doParallel_1.0.16 iterators_1.0.13  foreach_1.5.1     ggsci_2.9        
[13] ggpubr_0.4.0      SpiecEasi_1.1.2   plotly_4.10.0     GGally_2.1.2     
[17] microbiome_1.14.0 phyloseq_1.36.0   forcats_0.5.1     stringr_1.4.0    
[21] dplyr_1.0.7       purrr_0.3.4       readr_2.0.2       tidyr_1.1.4      
[25] tibble_3.1.5      ggplot2_3.3.5     tidyverse_1.3.1  

loaded via a namespace (and not attached):
  [1] readxl_1.3.1           backports_1.2.1        VGAM_1.1-5            
  [4] plyr_1.8.6             igraph_1.2.6           lazyeval_0.2.2        
  [7] splines_4.1.1          crosstalk_1.1.1        GenomeInfoDb_1.28.4   
 [10] digest_0.6.28          htmltools_0.5.2        fansi_0.5.0           
 [13] checkmate_2.0.0        magrittr_2.0.1         cluster_2.1.2         
 [16] tzdb_0.1.2             openxlsx_4.2.4         Biostrings_2.60.2     
 [19] modelr_0.1.8           jpeg_0.1-9             colorspace_2.0-2      
 [22] rvest_1.0.1            haven_2.4.3            xfun_0.26             
 [25] crayon_1.4.1           RCurl_1.98-1.5         jsonlite_1.7.2        
 [28] Exact_3.0              ape_5.5                glue_1.4.2            
 [31] gtable_0.3.0           zlibbioc_1.38.0        XVector_0.32.0        
 [34] car_3.0-11             Rhdf5lib_1.14.2        shape_1.4.6           
 [37] BiocGenerics_0.38.0    abind_1.4-5            scales_1.1.1          
 [40] mvtnorm_1.1-2          DBI_1.1.1              rstatix_0.7.0         
 [43] Rcpp_1.0.7             htmlTable_2.2.1        viridisLite_0.4.0     
 [46] foreign_0.8-81         proxy_0.4-26           stats4_4.1.1          
 [49] glmnet_4.1-2           htmlwidgets_1.5.4      httr_1.4.2            
 [52] pulsar_0.3.7           RColorBrewer_1.1-2     ellipsis_0.3.2        
 [55] farver_2.1.0           pkgconfig_2.0.3        reshape_0.8.8         
 [58] nnet_7.3-16            dbplyr_2.1.1           utf8_1.2.2            
 [61] labeling_0.4.2         tidyselect_1.1.1       rlang_0.4.11          
 [64] reshape2_1.4.4         munsell_0.5.0          cellranger_1.1.0      
 [67] tools_4.1.1            cli_3.0.1              generics_0.1.0        
 [70] ade4_1.7-18            broom_0.7.9            evaluate_0.14         
 [73] biomformat_1.20.0      fastmap_1.1.0          yaml_2.2.1            
 [76] knitr_1.36             fs_1.5.0               zip_2.2.0             
 [79] rootSolve_1.8.2.3      nlme_3.1-153           xml2_1.3.2            
 [82] compiler_4.1.1         rstudioapi_0.13        png_0.1-7             
 [85] curl_4.3.2             e1071_1.7-9            ggsignif_0.6.3        
 [88] reprex_2.0.1           huge_1.3.5             gsl_2.1-7             
 [91] stringi_1.7.5          highr_0.9              Matrix_1.3-4          
 [94] vegan_2.5-7            permute_0.9-5          multtest_2.48.0       
 [97] vctrs_0.3.8            pillar_1.6.3           lifecycle_1.0.1       
[100] rhdf5filters_1.4.0     jquerylib_0.1.4        cowplot_1.1.1         
[103] data.table_1.14.2      bitops_1.0-7           lmom_2.8              
[106] latticeExtra_0.6-29    R6_2.5.1               gridExtra_2.3         
[109] rio_0.5.27             IRanges_2.26.0         gld_2.6.2             
[112] codetools_0.2-18       boot_1.3-28            MASS_7.3-54           
[115] assertthat_0.2.1       rhdf5_2.36.0           withr_2.4.2           
[118] S4Vectors_0.30.2       GenomeInfoDbData_1.2.6 mgcv_1.8-37           
[121] expm_0.999-6           hms_1.1.1              rpart_4.1-15          
[124] grid_4.1.1             class_7.3-19           rmarkdown_2.11        
[127] carData_3.0-4          Rtsne_0.15             base64enc_0.1-3       
[130] Biobase_2.52.0         lubridate_1.7.10